In [326]:
import numpy as np
import pandas as pa
import matplotlib.pyplot as plt
from sklearn import linear_model
In [327]:
# Column name -> dtype mapping handed to read_csv for every house-sales CSV.
dtype_dict = {
    'bathrooms': float,
    'waterfront': int,
    'sqft_above': int,
    'sqft_living15': float,
    'grade': int,
    'yr_renovated': int,
    'price': float,
    'bedrooms': float,
    'zipcode': str,
    'long': float,
    'sqft_lot15': float,
    'sqft_living': float,
    'floors': str,
    'condition': int,
    'lat': float,
    'date': str,
    'sqft_basement': int,
    'yr_built': int,
    'id': str,
    'sqft_lot': int,
    'view': int,
}

# Root directory that the dataset paths below are appended to.
regressionDir = '/media/weenkus/The Beast/Programming/Workspace/Projects/Machine-Learning-University-of-Washington/'
In [328]:
# Read every CSV with the shared dtype mapping; the order of the paths below
# matches the order of the names on the left-hand side.
(house_train, house_valid, house_test,
 house_set1, house_set2, house_set3, house_set4,
 sales) = (
    pa.read_csv(regressionDir + relative_path, dtype=dtype_dict)
    for relative_path in (
        'Regression/datasets/wk3_kc_house_train_data.csv',
        'Regression/datasets/wk3_kc_house_valid_data.csv',
        'Regression/datasets/wk3_kc_house_test_data.csv',
        'Regression/datasets/wk3_kc_house_set_1_data.csv',
        'Regression/datasets/wk3_kc_house_set_2_data.csv',
        'Regression/datasets/wk3_kc_house_set_3_data.csv',
        'Regression/datasets/wk3_kc_house_set_4_data.csv',
        'Regression/datasets/kc_house_data.csv',
    )
)
In [329]:
# Display the leading rows of house_train as a quick check of the loaded columns.
house_train.head()
Out[329]:
In [330]:
# Show plots in jupyter
%matplotlib inline
plt.scatter(house_train.price, house_train.bedrooms, alpha=0.5)
plt.ylabel('price')
plt.xlabel('bedrooms')
plt.show()
In [331]:
# Price goes on the y-axis and sqft_living on the x-axis so the data matches
# the axis labels set below.
plt.scatter(house_train.sqft_living, house_train.price, alpha=0.5)
plt.ylabel('price')
plt.xlabel('sqft_living')
plt.show()
In [332]:
# Price goes on the y-axis and zipcode on the x-axis so the data matches the
# axis labels set below. zipcode is loaded as str (see dtype_dict).
plt.scatter(house_train.zipcode, house_train.price, alpha=0.5)
plt.ylabel('price')
plt.xlabel('zipcode')
plt.show()
In [333]:
def polynomial_dataframe(feature, degree):
    """Build a DataFrame holding the powers 1..degree of a single feature.

    Parameters
    ----------
    feature : pandas.Series
        Values to raise to successive powers. Its index becomes the index of
        the returned frame.
    degree : int
        Highest power to generate. Assumed to be >= 1; for smaller values only
        the 'power_1' column is produced.

    Returns
    -------
    pandas.DataFrame
        Columns 'power_1', 'power_2', ..., 'power_<degree>' in increasing
        order, where 'power_k' holds the feature raised element-wise to k.

    Notes
    -----
    Powers are computed with vectorised arithmetic, so an integer-typed
    feature uses fixed-width integer maths; pass floats if high powers could
    exceed the integer range.
    """
    poly_dataframe = pa.DataFrame()
    poly_dataframe['power_1'] = feature
    # range(2, degree + 1) is empty when degree <= 1, so no extra guard is needed.
    for power in range(2, degree + 1):
        # Vectorised power of the base column instead of a per-element apply().
        poly_dataframe['power_' + str(power)] = poly_dataframe['power_1'] ** power
    return poly_dataframe
In [334]:
# Order the sales by living area, breaking ties by price. DataFrame.sort was
# removed from pandas; sort_values is the supported equivalent.
sales = sales.sort_values(['sqft_living', 'price'])
sales.head()
Out[334]:
In [335]:
# Degree-1 feature frame for sqft_living with the matching price column attached.
poly1_data = polynomial_dataframe(sales['sqft_living'], 1).assign(price=sales['price'])
In [336]:
# Display the leading rows of the degree-1 frame (power_1 plus price).
poly1_data.head()
Out[336]:
In [337]:
# Fit price against the single power_1 column; fit() returns the estimator itself.
model1 = linear_model.LinearRegression().fit(poly1_data[['power_1']], poly1_data['price'])
# Leave the fitted estimator as the cell's displayed value.
model1
Out[337]:
In [338]:
# Observed prices as points.
plt.plot(poly1_data.power_1, poly1_data.price, '.')
# Predictions of model1 over the same inputs as a line.
plt.plot(poly1_data[['power_1']], model1.predict(poly1_data[['power_1']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft')
Out[338]:
In [339]:
# Powers 1..3 of sqft_living with the matching price column attached.
poly3_data = polynomial_dataframe(sales['sqft_living'], 3).assign(price=sales['price'])
In [340]:
# Display the leading rows of the frame holding powers 1..3 of sqft_living plus price.
poly3_data.head() # third polynomial
Out[340]:
In [341]:
# NOTE(review): each model is fitted on one power column only (power_2 or
# power_3), not on all powers up to that degree - confirm this is intended.
model2 = linear_model.LinearRegression().fit(poly3_data[['power_2']], poly3_data['price'])
model3 = linear_model.LinearRegression().fit(poly3_data[['power_3']], poly3_data['price'])
# Leave the last fitted estimator as the cell's displayed value.
model3
Out[341]:
In [342]:
# model2 was fitted on the power_2 column, so predictions must be made from
# that same column; feeding it the price column (the target) is meaningless.
plt.plot(poly3_data[['power_2']], model2.predict(poly3_data[['power_2']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 2')
Out[342]:
In [343]:
# Print the coefficient array learned by model2.
print ('Model2: ', model2.coef_)
In [344]:
# model3 was fitted on the power_3 column, so predictions must be made from
# that same column; feeding it the price column (the target) is meaningless.
plt.plot(poly3_data[['power_3']], model3.predict(poly3_data[['power_3']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 3')
Out[344]:
In [345]:
# Print the coefficient array learned by model3.
print ('Model3: ', model3.coef_)
In [346]:
# For each of the four subsets: powers 1..15 of sqft_living plus that
# subset's own price column.
poly15_set1_data, poly15_set2_data, poly15_set3_data, poly15_set4_data = (
    polynomial_dataframe(subset['sqft_living'], 15).assign(price=subset['price'])
    for subset in (house_set1, house_set2, house_set3, house_set4)
)
In [347]:
# Display the leading rows of the degree-15 frame built from house_set1.
poly15_set1_data.head()
Out[347]:
In [348]:
# One regression per subset. fit() returns the estimator itself.
# NOTE(review): every model is fitted on the power_15 column alone, not on
# power_1..power_15 - confirm this is intended.
model_poly15_set1 = linear_model.LinearRegression().fit(
    poly15_set1_data[['power_15']], poly15_set1_data['price'])
model_poly15_set2 = linear_model.LinearRegression().fit(
    poly15_set2_data[['power_15']], poly15_set2_data['price'])
model_poly15_set3 = linear_model.LinearRegression().fit(
    poly15_set3_data[['power_15']], poly15_set3_data['price'])
model_poly15_set4 = linear_model.LinearRegression().fit(
    poly15_set4_data[['power_15']], poly15_set4_data['price'])
# Leave the last fitted estimator as the cell's displayed value.
model_poly15_set4
Out[348]:
In [349]:
# Points: observed prices. Line: model predictions. The model was fitted on
# power_15, so it must predict from power_15 rather than from the price column.
plt.plot(poly15_set1_data.power_15, poly15_set1_data.price, '.',
         poly15_set1_data[['power_15']],
         model_poly15_set1.predict(poly15_set1_data[['power_15']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')
Out[349]:
In [350]:
#print ('Model for set1: ', model_poly15_set1.coef_)
In [351]:
# Points: observed prices. Line: model predictions. The model was fitted on
# power_15, so it must predict from power_15 rather than from the price column.
plt.plot(poly15_set2_data.power_15, poly15_set2_data.price, '.',
         poly15_set2_data[['power_15']],
         model_poly15_set2.predict(poly15_set2_data[['power_15']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')
Out[351]:
In [352]:
# Print the coefficient array of the set 2 model (label fixed from 'set12').
print ('Model for set2: ', model_poly15_set2.coef_)
In [353]:
# Points: observed prices. Line: model predictions. The model was fitted on
# power_15, so it must predict from power_15 rather than from the price column.
plt.plot(poly15_set3_data.power_15, poly15_set3_data.price, '.',
         poly15_set3_data[['power_15']],
         model_poly15_set3.predict(poly15_set3_data[['power_15']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')
Out[353]:
In [354]:
# Print the coefficient array of the model fitted on set 3.
print ('Model for set3: ', model_poly15_set3.coef_)
In [355]:
# Points: observed prices. Line: model predictions. The model was fitted on
# power_15, so it must predict from power_15 rather than from the price column.
plt.plot(poly15_set4_data.power_15, poly15_set4_data.price, '.',
         poly15_set4_data[['power_15']],
         model_poly15_set4.predict(poly15_set4_data[['power_15']]), '-')
plt.ylabel('House price')
plt.xlabel('Sqft polynomial degree 15')
Out[355]:
In [356]:
# Print the coefficient array of the model fitted on set 4.
print ('Model for set4: ', model_poly15_set4.coef_)
In [357]:
# Engineering the test set: powers 1..15 of sqft_living plus the price column.
poly_data_test = polynomial_dataframe(house_test['sqft_living'], 15)
poly_data_test['price'] = house_test['price']

# Engineering the validation set the same way.
poly_data_validation = polynomial_dataframe(house_valid['sqft_living'], 15)
poly_data_validation['price'] = house_valid['price']

# Only the last expression of a cell is displayed, so the former mid-cell
# poly_data_test.head() (whose result was discarded) has been dropped.
poly_data_validation.head()
Out[357]:
In [358]:
import sys

# Column names power_1 .. power_15; index[0:k] selects the features of a
# degree-k polynomial model.
index = ['power_' + str(exponent) for exponent in range(1, 16)]

# Build the full degree-15 training frame once; each model uses a prefix of
# its columns, so there is no need to rebuild it for every degree.
poly_data_training = polynomial_dataframe(house_train['sqft_living'], 15)
poly_data_training['price'] = house_train['price']

# Best-so-far trackers, initialised before the loop so that minPower and
# optimalModel are also bound when degree 1 turns out to be the best model.
# `min` shadows the builtin, but the following cells read this name, so it is kept.
min = float('inf')
minPower = None
optimalModel = None

for power in range(1, 16):
    features = index[0:power]
    # Build a model and fit it using the training data
    model = linear_model.LinearRegression()
    model.fit(poly_data_training[features], poly_data_training['price'])
    # Compute the RSS on the validation set (the test set is kept for the final check)
    RSS = ((model.predict(poly_data_validation[features]) - poly_data_validation.price) ** 2).sum()
    print('The RSS for ', power,'th degree polynomial: ', RSS)
    # Save the min RSS together with the degree and model that achieved it
    if RSS < min:
        min = RSS
        minPower = power
        optimalModel = model
In [359]:
# Report the lowest RSS recorded by the selection loop and the degree that produced it.
print ('The minimum RSS: ', min, ' for the ', minPower,'th degree polynomial')
In [360]:
name = 'power_' + str(minPower)
# Residuals of the selected model on the test frame, using the first
# minPower power columns as features.
test_residuals = optimalModel.predict(poly_data_test[index[0:minPower]]) - poly_data_test['price']
print("Test RSS: %.2f" % (test_residuals ** 2).sum())
In [ ]: